{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gym\n",
    "import time\n",
    "import numpy as np \n",
    "import sys\n",
    "sys.path.append(\"/home/wzy/reinforcementLearning/project/gym-master/gym/envs/toy_text\") \n",
    "from cliffwalking import CliffWalkingEnv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "导入相关包，并从本地 gym 源码目录导入悬崖行走环境类 CliffWalkingEnv（环境实例在后面的单元中创建）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "class qlearningAgent(object):\n",
    "    \"\"\"Tabular Q-learning agent with an epsilon-greedy behavior policy.\n",
    "\n",
    "    Args:\n",
    "        obs_n: Number of discrete observations (rows of the Q-table).\n",
    "        act_n: Number of discrete actions (columns of the Q-table).\n",
    "        learning_rate: Step size applied to every temporal-difference update.\n",
    "        gamma: Discount factor applied to the bootstrapped next-state value.\n",
    "        e_greed: Probability of taking a uniformly random action in sample().\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, obs_n, act_n, learning_rate=0.01, gamma=0.9, e_greed=0.1):\n",
    "        self.act_n = act_n      # number of selectable actions\n",
    "        self.lr = learning_rate # learning rate\n",
    "        self.gamma = gamma      # reward discount factor\n",
    "        self.epsilon = e_greed  # probability of picking a random action\n",
    "        self.Q = np.zeros((obs_n, act_n))  # Q-table, zero-initialized\n",
    "\n",
    "    def sample(self, obs):\n",
    "        \"\"\"Return an action for obs using epsilon-greedy exploration.\"\"\"\n",
    "        if np.random.uniform(0, 1) < (1.0 - self.epsilon):\n",
    "            # Exploit: follow the current Q-table.\n",
    "            action = self.predict(obs)\n",
    "        else:\n",
    "            # Explore: pick any action uniformly at random.\n",
    "            action = np.random.choice(self.act_n)\n",
    "        return action\n",
    "\n",
    "    def predict(self, obs):\n",
    "        \"\"\"Return a greedy action for obs, breaking ties uniformly at random.\"\"\"\n",
    "        Q_list = self.Q[obs, :]\n",
    "        maxQ = np.max(Q_list)\n",
    "        # Several actions may share the maximum Q-value.\n",
    "        action_list = np.where(Q_list == maxQ)[0]\n",
    "        action = np.random.choice(action_list)\n",
    "        return action\n",
    "\n",
    "    def learn(self, obs, action, reward, next_obs, done):\n",
    "        \"\"\"Apply one Q-learning update for the transition (obs, action) -> next_obs.\n",
    "\n",
    "        When done is true the target is the reward alone, because there is\n",
    "        no next state to bootstrap from.\n",
    "        \"\"\"\n",
    "        predict_Q = self.Q[obs, action]\n",
    "        if done:\n",
    "            target_Q = reward\n",
    "        else:\n",
    "            # Off-policy target: best action value in the next state.\n",
    "            target_Q = reward + self.gamma * np.max(self.Q[next_obs, :])\n",
    "        self.Q[obs, action] += self.lr * (target_Q - predict_Q)\n",
    "\n",
    "    def save(self, npy_file='./qlist.npy'):\n",
    "        \"\"\"Write the Q-table to npy_file with np.save.\n",
    "\n",
    "        The default path matches the default of restore(), so a bare\n",
    "        save() / restore() pair round-trips the table.\n",
    "        \"\"\"\n",
    "        np.save(npy_file, self.Q)\n",
    "        print(npy_file + ' saved.')\n",
    "\n",
    "    def restore(self, npy_file='./qlist.npy'):\n",
    "        \"\"\"Replace the Q-table with the array loaded from npy_file.\"\"\"\n",
    "        self.Q = np.load(npy_file)\n",
    "        print(npy_file + ' loaded.')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "构建qlearning的类，实现主要功能"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_episode(env, agent, render=False):\n",
    "    \"\"\"Play one training episode, updating the agent after every step.\n",
    "\n",
    "    Args:\n",
    "        env: Environment exposing reset(), step(action) and render().\n",
    "        agent: Agent exposing sample(obs) and learn(obs, action, reward, next_obs, done).\n",
    "        render: When true, call env.render() after each step.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (total_reward, total_steps) for the episode.\n",
    "    \"\"\"\n",
    "    step_count = 0\n",
    "    reward_sum = 0\n",
    "    state = env.reset()\n",
    "    finished = False\n",
    "\n",
    "    while not finished:\n",
    "        chosen = agent.sample(state)  # exploratory action selection\n",
    "        following, reward, finished, _ = env.step(chosen)\n",
    "        # Update the agent on the transition that was just observed.\n",
    "        agent.learn(state, chosen, reward, following, finished)\n",
    "        state = following\n",
    "        reward_sum += reward\n",
    "        step_count += 1\n",
    "        if render:\n",
    "            env.render()\n",
    "    return reward_sum, step_count"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "训练函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "def test_episode(env, agent, max_steps=1000, render=False):\n",
    "    \"\"\"Roll out one episode with the agent's greedy policy and no learning.\n",
    "\n",
    "    A greedy policy read from a partially trained Q-table can keep revisiting\n",
    "    the same states without ever reaching a terminal one, so the rollout is\n",
    "    capped at max_steps instead of looping until done.\n",
    "\n",
    "    Args:\n",
    "        env: Environment exposing reset(), step(action) and render().\n",
    "        agent: Agent exposing predict(obs).\n",
    "        max_steps: Maximum number of environment steps; None removes the cap.\n",
    "        render: When true, call env.render() after each step.\n",
    "\n",
    "    Returns:\n",
    "        Sum of the rewards collected before termination or truncation.\n",
    "    \"\"\"\n",
    "    total_reward = 0\n",
    "    steps = 0\n",
    "    obs = env.reset()\n",
    "    while max_steps is None or steps < max_steps:\n",
    "        action = agent.predict(obs)  # greedy, no exploration\n",
    "        obs, reward, done, _ = env.step(action)\n",
    "        total_reward += reward\n",
    "        steps += 1\n",
    "        if render:\n",
    "            env.render()\n",
    "        if done:\n",
    "            break\n",
    "    return total_reward"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "测试函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Episode 0: steps = 899 , reward = -2285.0\n",
      "Episode 1: steps = 326 , reward = -623.0\n",
      "Episode 2: steps = 378 , reward = -576.0\n",
      "Episode 3: steps = 530 , reward = -926.0\n",
      "Episode 4: steps = 745 , reward = -1735.0\n",
      "Episode 5: steps = 549 , reward = -846.0\n",
      "Episode 6: steps = 260 , reward = -260.0\n",
      "Episode 7: steps = 420 , reward = -717.0\n",
      "Episode 8: steps = 283 , reward = -382.0\n",
      "Episode 9: steps = 598 , reward = -796.0\n",
      "Episode 10: steps = 387 , reward = -882.0\n",
      "Episode 11: steps = 997 , reward = -2086.0\n",
      "Episode 12: steps = 443 , reward = -740.0\n",
      "Episode 13: steps = 130 , reward = -130.0\n",
      "Episode 14: steps = 295 , reward = -295.0\n",
      "Episode 15: steps = 284 , reward = -581.0\n",
      "Episode 16: steps = 139 , reward = -139.0\n",
      "Episode 17: steps = 337 , reward = -535.0\n",
      "Episode 18: steps = 185 , reward = -284.0\n",
      "Episode 19: steps = 575 , reward = -872.0\n",
      "Episode 20: steps = 284 , reward = -482.0\n",
      "Episode 21: steps = 618 , reward = -1311.0\n",
      "Episode 22: steps = 145 , reward = -145.0\n",
      "Episode 23: steps = 99 , reward = -99.0\n",
      "Episode 24: steps = 403 , reward = -502.0\n",
      "Episode 25: steps = 321 , reward = -420.0\n",
      "Episode 26: steps = 233 , reward = -233.0\n",
      "Episode 27: steps = 218 , reward = -317.0\n",
      "Episode 28: steps = 490 , reward = -787.0\n",
      "Episode 29: steps = 412 , reward = -907.0\n",
      "Episode 30: steps = 142 , reward = -241.0\n",
      "Episode 31: steps = 187 , reward = -187.0\n",
      "Episode 32: steps = 379 , reward = -577.0\n",
      "Episode 33: steps = 101 , reward = -101.0\n",
      "Episode 34: steps = 514 , reward = -1306.0\n",
      "Episode 35: steps = 217 , reward = -415.0\n",
      "Episode 36: steps = 236 , reward = -335.0\n",
      "Episode 37: steps = 267 , reward = -762.0\n",
      "Episode 38: steps = 283 , reward = -382.0\n",
      "Episode 39: steps = 195 , reward = -195.0\n",
      "Episode 40: steps = 152 , reward = -251.0\n",
      "Episode 41: steps = 300 , reward = -399.0\n",
      "Episode 42: steps = 185 , reward = -383.0\n",
      "Episode 43: steps = 107 , reward = -107.0\n",
      "Episode 44: steps = 418 , reward = -715.0\n",
      "Episode 45: steps = 280 , reward = -577.0\n",
      "Episode 46: steps = 104 , reward = -104.0\n",
      "Episode 47: steps = 478 , reward = -973.0\n",
      "Episode 48: steps = 185 , reward = -284.0\n",
      "Episode 49: steps = 126 , reward = -324.0\n",
      "Episode 50: steps = 189 , reward = -189.0\n",
      "Episode 51: steps = 210 , reward = -210.0\n",
      "Episode 52: steps = 152 , reward = -251.0\n",
      "Episode 53: steps = 393 , reward = -690.0\n",
      "Episode 54: steps = 47 , reward = -47.0\n",
      "Episode 55: steps = 146 , reward = -245.0\n",
      "Episode 56: steps = 364 , reward = -760.0\n",
      "Episode 57: steps = 150 , reward = -249.0\n",
      "Episode 58: steps = 123 , reward = -123.0\n",
      "Episode 59: steps = 356 , reward = -851.0\n",
      "Episode 60: steps = 97 , reward = -196.0\n",
      "Episode 61: steps = 236 , reward = -434.0\n",
      "Episode 62: steps = 311 , reward = -707.0\n",
      "Episode 63: steps = 88 , reward = -88.0\n",
      "Episode 64: steps = 97 , reward = -97.0\n",
      "Episode 65: steps = 227 , reward = -524.0\n",
      "Episode 66: steps = 207 , reward = -306.0\n",
      "Episode 67: steps = 190 , reward = -190.0\n",
      "Episode 68: steps = 136 , reward = -136.0\n",
      "Episode 69: steps = 89 , reward = -89.0\n",
      "Episode 70: steps = 176 , reward = -275.0\n",
      "Episode 71: steps = 116 , reward = -215.0\n",
      "Episode 72: steps = 83 , reward = -83.0\n",
      "Episode 73: steps = 350 , reward = -548.0\n",
      "Episode 74: steps = 369 , reward = -864.0\n",
      "Episode 75: steps = 149 , reward = -446.0\n",
      "Episode 76: steps = 284 , reward = -482.0\n",
      "Episode 77: steps = 214 , reward = -313.0\n",
      "Episode 78: steps = 106 , reward = -205.0\n",
      "Episode 79: steps = 81 , reward = -81.0\n",
      "Episode 80: steps = 231 , reward = -330.0\n",
      "Episode 81: steps = 218 , reward = -416.0\n",
      "Episode 82: steps = 221 , reward = -320.0\n",
      "Episode 83: steps = 224 , reward = -521.0\n",
      "Episode 84: steps = 97 , reward = -97.0\n",
      "Episode 85: steps = 135 , reward = -135.0\n",
      "Episode 86: steps = 120 , reward = -318.0\n",
      "Episode 87: steps = 84 , reward = -84.0\n",
      "Episode 88: steps = 154 , reward = -253.0\n",
      "Episode 89: steps = 224 , reward = -422.0\n",
      "Episode 90: steps = 127 , reward = -226.0\n",
      "Episode 91: steps = 342 , reward = -837.0\n",
      "Episode 92: steps = 38 , reward = -38.0\n",
      "Episode 93: steps = 99 , reward = -198.0\n",
      "Episode 94: steps = 236 , reward = -434.0\n",
      "Episode 95: steps = 129 , reward = -228.0\n",
      "Episode 96: steps = 77 , reward = -77.0\n",
      "Episode 97: steps = 213 , reward = -609.0\n",
      "Episode 98: steps = 133 , reward = -133.0\n",
      "Episode 99: steps = 102 , reward = -102.0\n",
      "Episode 100: steps = 88 , reward = -88.0\n",
      "Episode 101: steps = 165 , reward = -462.0\n",
      "Episode 102: steps = 328 , reward = -922.0\n",
      "Episode 103: steps = 161 , reward = -260.0\n",
      "Episode 104: steps = 122 , reward = -122.0\n",
      "Episode 105: steps = 139 , reward = -337.0\n",
      "Episode 106: steps = 128 , reward = -227.0\n",
      "Episode 107: steps = 144 , reward = -144.0\n",
      "Episode 108: steps = 247 , reward = -346.0\n",
      "Episode 109: steps = 215 , reward = -710.0\n",
      "Episode 110: steps = 102 , reward = -102.0\n",
      "Episode 111: steps = 56 , reward = -56.0\n",
      "Episode 112: steps = 278 , reward = -476.0\n",
      "Episode 113: steps = 62 , reward = -161.0\n",
      "Episode 114: steps = 78 , reward = -78.0\n",
      "Episode 115: steps = 141 , reward = -240.0\n",
      "Episode 116: steps = 286 , reward = -583.0\n",
      "Episode 117: steps = 129 , reward = -129.0\n",
      "Episode 118: steps = 116 , reward = -215.0\n",
      "Episode 119: steps = 43 , reward = -43.0\n",
      "Episode 120: steps = 155 , reward = -155.0\n",
      "Episode 121: steps = 85 , reward = -85.0\n",
      "Episode 122: steps = 82 , reward = -82.0\n",
      "Episode 123: steps = 116 , reward = -116.0\n",
      "Episode 124: steps = 163 , reward = -262.0\n",
      "Episode 125: steps = 145 , reward = -442.0\n",
      "Episode 126: steps = 167 , reward = -266.0\n",
      "Episode 127: steps = 62 , reward = -62.0\n",
      "Episode 128: steps = 114 , reward = -213.0\n",
      "Episode 129: steps = 211 , reward = -409.0\n",
      "Episode 130: steps = 191 , reward = -389.0\n",
      "Episode 131: steps = 107 , reward = -107.0\n",
      "Episode 132: steps = 87 , reward = -186.0\n",
      "Episode 133: steps = 93 , reward = -93.0\n",
      "Episode 134: steps = 128 , reward = -128.0\n",
      "Episode 135: steps = 72 , reward = -72.0\n",
      "Episode 136: steps = 222 , reward = -420.0\n",
      "Episode 137: steps = 153 , reward = -252.0\n",
      "Episode 138: steps = 101 , reward = -200.0\n",
      "Episode 139: steps = 189 , reward = -288.0\n",
      "Episode 140: steps = 119 , reward = -218.0\n",
      "Episode 141: steps = 90 , reward = -90.0\n",
      "Episode 142: steps = 83 , reward = -83.0\n",
      "Episode 143: steps = 191 , reward = -191.0\n",
      "Episode 144: steps = 45 , reward = -45.0\n",
      "Episode 145: steps = 73 , reward = -73.0\n",
      "Episode 146: steps = 99 , reward = -99.0\n",
      "Episode 147: steps = 101 , reward = -101.0\n",
      "Episode 148: steps = 75 , reward = -75.0\n",
      "Episode 149: steps = 62 , reward = -62.0\n",
      "Episode 150: steps = 250 , reward = -547.0\n",
      "Episode 151: steps = 139 , reward = -238.0\n",
      "Episode 152: steps = 148 , reward = -247.0\n",
      "Episode 153: steps = 135 , reward = -333.0\n",
      "Episode 154: steps = 76 , reward = -76.0\n",
      "Episode 155: steps = 115 , reward = -214.0\n",
      "Episode 156: steps = 204 , reward = -402.0\n",
      "Episode 157: steps = 128 , reward = -227.0\n",
      "Episode 158: steps = 71 , reward = -71.0\n",
      "Episode 159: steps = 103 , reward = -202.0\n",
      "Episode 160: steps = 75 , reward = -75.0\n",
      "Episode 161: steps = 66 , reward = -66.0\n",
      "Episode 162: steps = 93 , reward = -93.0\n",
      "Episode 163: steps = 73 , reward = -73.0\n",
      "Episode 164: steps = 81 , reward = -81.0\n",
      "Episode 165: steps = 96 , reward = -96.0\n",
      "Episode 166: steps = 96 , reward = -96.0\n",
      "Episode 167: steps = 74 , reward = -74.0\n",
      "Episode 168: steps = 72 , reward = -171.0\n",
      "Episode 169: steps = 172 , reward = -370.0\n",
      "Episode 170: steps = 84 , reward = -183.0\n",
      "Episode 171: steps = 135 , reward = -135.0\n",
      "Episode 172: steps = 120 , reward = -219.0\n",
      "Episode 173: steps = 70 , reward = -70.0\n",
      "Episode 174: steps = 117 , reward = -117.0\n",
      "Episode 175: steps = 89 , reward = -89.0\n",
      "Episode 176: steps = 56 , reward = -56.0\n",
      "Episode 177: steps = 82 , reward = -82.0\n",
      "Episode 178: steps = 113 , reward = -113.0\n",
      "Episode 179: steps = 68 , reward = -68.0\n",
      "Episode 180: steps = 93 , reward = -192.0\n",
      "Episode 181: steps = 112 , reward = -112.0\n",
      "Episode 182: steps = 121 , reward = -121.0\n",
      "Episode 183: steps = 63 , reward = -63.0\n",
      "Episode 184: steps = 59 , reward = -59.0\n",
      "Episode 185: steps = 119 , reward = -218.0\n",
      "Episode 186: steps = 146 , reward = -344.0\n",
      "Episode 187: steps = 61 , reward = -61.0\n",
      "Episode 188: steps = 75 , reward = -273.0\n",
      "Episode 189: steps = 110 , reward = -110.0\n",
      "Episode 190: steps = 33 , reward = -33.0\n",
      "Episode 191: steps = 97 , reward = -196.0\n",
      "Episode 192: steps = 70 , reward = -70.0\n",
      "Episode 193: steps = 276 , reward = -375.0\n",
      "Episode 194: steps = 63 , reward = -162.0\n",
      "Episode 195: steps = 129 , reward = -228.0\n",
      "Episode 196: steps = 70 , reward = -70.0\n",
      "Episode 197: steps = 157 , reward = -256.0\n",
      "Episode 198: steps = 77 , reward = -275.0\n",
      "Episode 199: steps = 143 , reward = -242.0\n",
      "Episode 200: steps = 76 , reward = -76.0\n",
      "Episode 201: steps = 66 , reward = -66.0\n",
      "Episode 202: steps = 80 , reward = -80.0\n",
      "Episode 203: steps = 96 , reward = -96.0\n",
      "Episode 204: steps = 106 , reward = -205.0\n",
      "Episode 205: steps = 112 , reward = -211.0\n",
      "Episode 206: steps = 201 , reward = -696.0\n",
      "Episode 207: steps = 102 , reward = -201.0\n",
      "Episode 208: steps = 68 , reward = -68.0\n",
      "Episode 209: steps = 100 , reward = -100.0\n",
      "Episode 210: steps = 74 , reward = -74.0\n",
      "Episode 211: steps = 119 , reward = -218.0\n",
      "Episode 212: steps = 66 , reward = -66.0\n",
      "Episode 213: steps = 178 , reward = -376.0\n",
      "Episode 214: steps = 108 , reward = -207.0\n",
      "Episode 215: steps = 56 , reward = -56.0\n",
      "Episode 216: steps = 59 , reward = -158.0\n",
      "Episode 217: steps = 126 , reward = -126.0\n",
      "Episode 218: steps = 119 , reward = -218.0\n",
      "Episode 219: steps = 90 , reward = -189.0\n",
      "Episode 220: steps = 52 , reward = -52.0\n",
      "Episode 221: steps = 95 , reward = -95.0\n",
      "Episode 222: steps = 73 , reward = -172.0\n",
      "Episode 223: steps = 124 , reward = -223.0\n",
      "Episode 224: steps = 32 , reward = -32.0\n",
      "Episode 225: steps = 167 , reward = -365.0\n",
      "Episode 226: steps = 97 , reward = -97.0\n",
      "Episode 227: steps = 42 , reward = -42.0\n",
      "Episode 228: steps = 96 , reward = -96.0\n",
      "Episode 229: steps = 59 , reward = -59.0\n",
      "Episode 230: steps = 130 , reward = -229.0\n",
      "Episode 231: steps = 40 , reward = -40.0\n",
      "Episode 232: steps = 78 , reward = -78.0\n",
      "Episode 233: steps = 66 , reward = -66.0\n",
      "Episode 234: steps = 243 , reward = -441.0\n",
      "Episode 235: steps = 61 , reward = -61.0\n",
      "Episode 236: steps = 27 , reward = -27.0\n",
      "Episode 237: steps = 162 , reward = -459.0\n",
      "Episode 238: steps = 93 , reward = -93.0\n",
      "Episode 239: steps = 62 , reward = -62.0\n",
      "Episode 240: steps = 55 , reward = -55.0\n",
      "Episode 241: steps = 78 , reward = -78.0\n",
      "Episode 242: steps = 142 , reward = -340.0\n",
      "Episode 243: steps = 63 , reward = -63.0\n",
      "Episode 244: steps = 156 , reward = -255.0\n",
      "Episode 245: steps = 40 , reward = -40.0\n",
      "Episode 246: steps = 162 , reward = -360.0\n",
      "Episode 247: steps = 39 , reward = -39.0\n",
      "Episode 248: steps = 153 , reward = -252.0\n",
      "Episode 249: steps = 116 , reward = -215.0\n",
      "Episode 250: steps = 46 , reward = -46.0\n",
      "Episode 251: steps = 109 , reward = -208.0\n",
      "Episode 252: steps = 100 , reward = -199.0\n",
      "Episode 253: steps = 208 , reward = -505.0\n",
      "Episode 254: steps = 121 , reward = -121.0\n",
      "Episode 255: steps = 21 , reward = -21.0\n",
      "Episode 256: steps = 43 , reward = -142.0\n",
      "Episode 257: steps = 73 , reward = -73.0\n",
      "Episode 258: steps = 62 , reward = -62.0\n",
      "Episode 259: steps = 65 , reward = -65.0\n",
      "Episode 260: steps = 118 , reward = -118.0\n",
      "Episode 261: steps = 46 , reward = -46.0\n",
      "Episode 262: steps = 91 , reward = -190.0\n",
      "Episode 263: steps = 63 , reward = -63.0\n",
      "Episode 264: steps = 140 , reward = -239.0\n",
      "Episode 265: steps = 22 , reward = -22.0\n",
      "Episode 266: steps = 113 , reward = -212.0\n",
      "Episode 267: steps = 74 , reward = -74.0\n",
      "Episode 268: steps = 35 , reward = -35.0\n",
      "Episode 269: steps = 169 , reward = -268.0\n",
      "Episode 270: steps = 56 , reward = -56.0\n",
      "Episode 271: steps = 29 , reward = -29.0\n",
      "Episode 272: steps = 253 , reward = -649.0\n",
      "Episode 273: steps = 60 , reward = -60.0\n",
      "Episode 274: steps = 82 , reward = -82.0\n",
      "Episode 275: steps = 76 , reward = -76.0\n",
      "Episode 276: steps = 78 , reward = -78.0\n",
      "Episode 277: steps = 62 , reward = -62.0\n",
      "Episode 278: steps = 65 , reward = -65.0\n",
      "Episode 279: steps = 71 , reward = -71.0\n",
      "Episode 280: steps = 88 , reward = -88.0\n",
      "Episode 281: steps = 122 , reward = -320.0\n",
      "Episode 282: steps = 50 , reward = -50.0\n",
      "Episode 283: steps = 102 , reward = -102.0\n",
      "Episode 284: steps = 38 , reward = -38.0\n",
      "Episode 285: steps = 102 , reward = -201.0\n",
      "Episode 286: steps = 89 , reward = -89.0\n",
      "Episode 287: steps = 85 , reward = -85.0\n",
      "Episode 288: steps = 40 , reward = -40.0\n",
      "Episode 289: steps = 103 , reward = -202.0\n",
      "Episode 290: steps = 96 , reward = -195.0\n",
      "Episode 291: steps = 39 , reward = -39.0\n",
      "Episode 292: steps = 80 , reward = -179.0\n",
      "Episode 293: steps = 71 , reward = -71.0\n",
      "Episode 294: steps = 32 , reward = -131.0\n",
      "Episode 295: steps = 160 , reward = -358.0\n",
      "Episode 296: steps = 124 , reward = -421.0\n",
      "Episode 297: steps = 44 , reward = -44.0\n",
      "Episode 298: steps = 52 , reward = -52.0\n",
      "Episode 299: steps = 84 , reward = -84.0\n",
      "Episode 300: steps = 48 , reward = -48.0\n",
      "Episode 301: steps = 109 , reward = -208.0\n",
      "Episode 302: steps = 89 , reward = -89.0\n",
      "Episode 303: steps = 56 , reward = -56.0\n",
      "Episode 304: steps = 49 , reward = -49.0\n",
      "Episode 305: steps = 76 , reward = -175.0\n",
      "Episode 306: steps = 51 , reward = -150.0\n",
      "Episode 307: steps = 63 , reward = -162.0\n",
      "Episode 308: steps = 95 , reward = -95.0\n",
      "Episode 309: steps = 79 , reward = -277.0\n",
      "Episode 310: steps = 126 , reward = -225.0\n",
      "Episode 311: steps = 54 , reward = -54.0\n",
      "Episode 312: steps = 88 , reward = -187.0\n",
      "Episode 313: steps = 72 , reward = -72.0\n",
      "Episode 314: steps = 62 , reward = -62.0\n",
      "Episode 315: steps = 42 , reward = -42.0\n",
      "Episode 316: steps = 124 , reward = -124.0\n",
      "Episode 317: steps = 38 , reward = -38.0\n",
      "Episode 318: steps = 48 , reward = -48.0\n",
      "Episode 319: steps = 107 , reward = -206.0\n",
      "Episode 320: steps = 76 , reward = -76.0\n",
      "Episode 321: steps = 44 , reward = -44.0\n",
      "Episode 322: steps = 92 , reward = -92.0\n",
      "Episode 323: steps = 68 , reward = -167.0\n",
      "Episode 324: steps = 65 , reward = -65.0\n",
      "Episode 325: steps = 81 , reward = -81.0\n",
      "Episode 326: steps = 56 , reward = -56.0\n",
      "Episode 327: steps = 74 , reward = -74.0\n",
      "Episode 328: steps = 67 , reward = -67.0\n",
      "Episode 329: steps = 82 , reward = -82.0\n",
      "Episode 330: steps = 52 , reward = -52.0\n",
      "Episode 331: steps = 63 , reward = -261.0\n",
      "Episode 332: steps = 42 , reward = -42.0\n",
      "Episode 333: steps = 155 , reward = -452.0\n",
      "Episode 334: steps = 55 , reward = -55.0\n",
      "Episode 335: steps = 43 , reward = -43.0\n",
      "Episode 336: steps = 108 , reward = -207.0\n",
      "Episode 337: steps = 74 , reward = -173.0\n",
      "Episode 338: steps = 50 , reward = -50.0\n",
      "Episode 339: steps = 50 , reward = -50.0\n",
      "Episode 340: steps = 49 , reward = -49.0\n",
      "Episode 341: steps = 60 , reward = -60.0\n",
      "Episode 342: steps = 65 , reward = -164.0\n",
      "Episode 343: steps = 72 , reward = -171.0\n",
      "Episode 344: steps = 107 , reward = -206.0\n",
      "Episode 345: steps = 70 , reward = -169.0\n",
      "Episode 346: steps = 88 , reward = -88.0\n",
      "Episode 347: steps = 52 , reward = -52.0\n",
      "Episode 348: steps = 54 , reward = -54.0\n",
      "Episode 349: steps = 70 , reward = -70.0\n",
      "Episode 350: steps = 39 , reward = -39.0\n",
      "Episode 351: steps = 60 , reward = -60.0\n",
      "Episode 352: steps = 53 , reward = -53.0\n",
      "Episode 353: steps = 143 , reward = -242.0\n",
      "Episode 354: steps = 59 , reward = -59.0\n",
      "Episode 355: steps = 67 , reward = -67.0\n",
      "Episode 356: steps = 39 , reward = -39.0\n",
      "Episode 357: steps = 78 , reward = -78.0\n",
      "Episode 358: steps = 53 , reward = -53.0\n",
      "Episode 359: steps = 47 , reward = -47.0\n",
      "Episode 360: steps = 72 , reward = -72.0\n",
      "Episode 361: steps = 60 , reward = -60.0\n",
      "Episode 362: steps = 32 , reward = -32.0\n",
      "Episode 363: steps = 200 , reward = -398.0\n",
      "Episode 364: steps = 139 , reward = -436.0\n",
      "Episode 365: steps = 93 , reward = -192.0\n",
      "Episode 366: steps = 52 , reward = -52.0\n",
      "Episode 367: steps = 137 , reward = -236.0\n",
      "Episode 368: steps = 46 , reward = -46.0\n",
      "Episode 369: steps = 53 , reward = -53.0\n",
      "Episode 370: steps = 72 , reward = -72.0\n",
      "Episode 371: steps = 57 , reward = -57.0\n",
      "Episode 372: steps = 71 , reward = -170.0\n",
      "Episode 373: steps = 48 , reward = -48.0\n",
      "Episode 374: steps = 93 , reward = -192.0\n",
      "Episode 375: steps = 46 , reward = -145.0\n",
      "Episode 376: steps = 55 , reward = -55.0\n",
      "Episode 377: steps = 67 , reward = -67.0\n",
      "Episode 378: steps = 53 , reward = -53.0\n",
      "Episode 379: steps = 65 , reward = -65.0\n",
      "Episode 380: steps = 70 , reward = -70.0\n",
      "Episode 381: steps = 70 , reward = -169.0\n",
      "Episode 382: steps = 42 , reward = -42.0\n",
      "Episode 383: steps = 79 , reward = -79.0\n",
      "Episode 384: steps = 83 , reward = -182.0\n",
      "Episode 385: steps = 38 , reward = -38.0\n",
      "Episode 386: steps = 64 , reward = -64.0\n",
      "Episode 387: steps = 64 , reward = -64.0\n",
      "Episode 388: steps = 41 , reward = -41.0\n",
      "Episode 389: steps = 93 , reward = -192.0\n",
      "Episode 390: steps = 73 , reward = -73.0\n",
      "Episode 391: steps = 35 , reward = -35.0\n",
      "Episode 392: steps = 63 , reward = -162.0\n",
      "Episode 393: steps = 64 , reward = -64.0\n",
      "Episode 394: steps = 65 , reward = -164.0\n",
      "Episode 395: steps = 85 , reward = -283.0\n",
      "Episode 396: steps = 43 , reward = -43.0\n",
      "Episode 397: steps = 76 , reward = -76.0\n",
      "Episode 398: steps = 83 , reward = -182.0\n",
      "Episode 399: steps = 44 , reward = -44.0\n",
      "Episode 400: steps = 64 , reward = -64.0\n",
      "Episode 401: steps = 22 , reward = -22.0\n",
      "Episode 402: steps = 58 , reward = -157.0\n",
      "Episode 403: steps = 79 , reward = -79.0\n",
      "Episode 404: steps = 48 , reward = -147.0\n",
      "Episode 405: steps = 45 , reward = -45.0\n",
      "Episode 406: steps = 62 , reward = -62.0\n",
      "Episode 407: steps = 72 , reward = -72.0\n",
      "Episode 408: steps = 57 , reward = -57.0\n",
      "Episode 409: steps = 48 , reward = -48.0\n",
      "Episode 410: steps = 66 , reward = -66.0\n",
      "Episode 411: steps = 34 , reward = -34.0\n",
      "Episode 412: steps = 48 , reward = -48.0\n",
      "Episode 413: steps = 83 , reward = -83.0\n",
      "Episode 414: steps = 26 , reward = -26.0\n",
      "Episode 415: steps = 88 , reward = -88.0\n",
      "Episode 416: steps = 29 , reward = -29.0\n",
      "Episode 417: steps = 103 , reward = -301.0\n",
      "Episode 418: steps = 47 , reward = -146.0\n",
      "Episode 419: steps = 68 , reward = -266.0\n",
      "Episode 420: steps = 74 , reward = -74.0\n",
      "Episode 421: steps = 77 , reward = -77.0\n",
      "Episode 422: steps = 41 , reward = -41.0\n",
      "Episode 423: steps = 106 , reward = -304.0\n",
      "Episode 424: steps = 34 , reward = -34.0\n",
      "Episode 425: steps = 59 , reward = -59.0\n",
      "Episode 426: steps = 26 , reward = -26.0\n",
      "Episode 427: steps = 93 , reward = -93.0\n",
      "Episode 428: steps = 59 , reward = -59.0\n",
      "Episode 429: steps = 105 , reward = -204.0\n",
      "Episode 430: steps = 25 , reward = -25.0\n",
      "Episode 431: steps = 54 , reward = -54.0\n",
      "Episode 432: steps = 98 , reward = -197.0\n",
      "Episode 433: steps = 52 , reward = -52.0\n",
      "Episode 434: steps = 47 , reward = -47.0\n",
      "Episode 435: steps = 58 , reward = -58.0\n",
      "Episode 436: steps = 45 , reward = -45.0\n",
      "Episode 437: steps = 55 , reward = -55.0\n",
      "Episode 438: steps = 89 , reward = -188.0\n",
      "Episode 439: steps = 130 , reward = -328.0\n",
      "Episode 440: steps = 66 , reward = -66.0\n",
      "Episode 441: steps = 81 , reward = -81.0\n",
      "Episode 442: steps = 38 , reward = -38.0\n",
      "Episode 443: steps = 41 , reward = -41.0\n",
      "Episode 444: steps = 42 , reward = -42.0\n",
      "Episode 445: steps = 103 , reward = -202.0\n",
      "Episode 446: steps = 40 , reward = -40.0\n",
      "Episode 447: steps = 60 , reward = -60.0\n",
      "Episode 448: steps = 138 , reward = -336.0\n",
      "Episode 449: steps = 36 , reward = -36.0\n",
      "Episode 450: steps = 67 , reward = -67.0\n",
      "Episode 451: steps = 96 , reward = -195.0\n",
      "Episode 452: steps = 56 , reward = -56.0\n",
      "Episode 453: steps = 66 , reward = -165.0\n",
      "Episode 454: steps = 34 , reward = -34.0\n",
      "Episode 455: steps = 69 , reward = -168.0\n",
      "Episode 456: steps = 53 , reward = -53.0\n",
      "Episode 457: steps = 48 , reward = -48.0\n",
      "Episode 458: steps = 44 , reward = -44.0\n",
      "Episode 459: steps = 47 , reward = -47.0\n",
      "Episode 460: steps = 61 , reward = -61.0\n",
      "Episode 461: steps = 70 , reward = -70.0\n",
      "Episode 462: steps = 39 , reward = -39.0\n",
      "Episode 463: steps = 34 , reward = -34.0\n",
      "Episode 464: steps = 95 , reward = -392.0\n",
      "Episode 465: steps = 44 , reward = -44.0\n",
      "Episode 466: steps = 51 , reward = -150.0\n",
      "Episode 467: steps = 121 , reward = -319.0\n",
      "Episode 468: steps = 42 , reward = -42.0\n",
      "Episode 469: steps = 102 , reward = -201.0\n",
      "Episode 470: steps = 33 , reward = -33.0\n",
      "Episode 471: steps = 75 , reward = -174.0\n",
      "Episode 472: steps = 63 , reward = -162.0\n",
      "Episode 473: steps = 87 , reward = -186.0\n",
      "Episode 474: steps = 43 , reward = -43.0\n",
      "Episode 475: steps = 61 , reward = -160.0\n",
      "Episode 476: steps = 55 , reward = -55.0\n",
      "Episode 477: steps = 108 , reward = -207.0\n",
      "Episode 478: steps = 43 , reward = -43.0\n",
      "Episode 479: steps = 51 , reward = -51.0\n",
      "Episode 480: steps = 60 , reward = -60.0\n",
      "Episode 481: steps = 44 , reward = -44.0\n",
      "Episode 482: steps = 32 , reward = -32.0\n",
      "Episode 483: steps = 108 , reward = -108.0\n",
      "Episode 484: steps = 23 , reward = -23.0\n",
      "Episode 485: steps = 39 , reward = -39.0\n",
      "Episode 486: steps = 47 , reward = -47.0\n",
      "Episode 487: steps = 45 , reward = -45.0\n",
      "Episode 488: steps = 47 , reward = -47.0\n",
      "Episode 489: steps = 45 , reward = -45.0\n",
      "Episode 490: steps = 76 , reward = -76.0\n",
      "Episode 491: steps = 49 , reward = -49.0\n",
      "Episode 492: steps = 70 , reward = -169.0\n",
      "Episode 493: steps = 60 , reward = -159.0\n",
      "Episode 494: steps = 85 , reward = -85.0\n",
      "Episode 495: steps = 31 , reward = -31.0\n",
      "Episode 496: steps = 52 , reward = -52.0\n",
      "Episode 497: steps = 54 , reward = -54.0\n",
      "Episode 498: steps = 50 , reward = -50.0\n",
      "Episode 499: steps = 61 , reward = -61.0\n"
     ]
    }
   ],
   "source": [
    "# Run configuration. Action selection draws from NumPy's global RNG, so the\n",
    "# seed is fixed here to make the printed training log reproducible.\n",
    "SEED = 0\n",
    "NUM_EPISODES = 500\n",
    "np.random.seed(SEED)\n",
    "\n",
    "# Create the cliff-walking environment.\n",
    "env = CliffWalkingEnv()  # 0 up, 1 right, 2 down, 3 left\n",
    "# Create the Q-learning agent and set its hyperparameters.\n",
    "agent = qlearningAgent(\n",
    "    obs_n=env.observation_space.n,\n",
    "    act_n=env.action_space.n,\n",
    "    learning_rate=0.01,\n",
    "    gamma=0.99,\n",
    "    e_greed=0.1)\n",
    "\n",
    "\n",
    "# Train for NUM_EPISODES episodes and print the score of each one.\n",
    "for episode in range(NUM_EPISODES):\n",
    "    ep_reward, ep_steps = run_episode(env, agent, False)\n",
    "    print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps, ep_reward))\n",
    "\n",
    "# Evaluate the learned greedy policy.\n",
    "test_reward = test_episode(env, agent)\n",
    "print('test reward = %.1f' % (test_reward))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "创建环境，设置超参数，得到结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "7648c2b9d25760d0d65f53f9b9a34de48caa24d8265d64b0ff81e2f2641d528d"
  },
  "kernelspec": {
   "display_name": "pytorch",
   "language": "python",
   "name": "pytorch"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
